Load the functions
source("thesis_functions.R")
dhist.plot <- function(d, v, vname){
ggplot(d, aes(x=v)) +
geom_histogram(aes(y = stat(density)), bins = 30, fill = "white", col = "black") +
stat_function(
fun = dnorm,
args = list(mean = mean(d$v), sd = sd(d$v)),
lwd = 1,
col = 'red'
) +
# xlab(vname) + ylab("Density") +
# ggtitle(paste(vname, "Distribution")) +
theme_bw()
}
set levels of ordinal data, and other variables to their appropriate dataypes. Did not adjust the datatypes for date/time since those will not be used in this portion
data <- fread("kpopdata.csv")
data <- mutate(data, ArtistType = as.factor(ArtistType),
ArtistGender = as.factor(ArtistGender),
ArtistGen = factor(ArtistGen),
release_date = as.POSIXct(release_date, format = "%m/%d/%y"),
key = factor(key, levels = 0:11),
mode = as.factor(mode),
time_signature = factor(time_signature, levels = c(1,3,4,5)))
#make month/year as continuous data in the form of number of months (~360)
ref.year = 1992 #we are using January 1992 as the reference
data$months = 12*(year(data$release_date) - ref.year) + month(data$release_date)
#View(select(data, Artist, song_name, release_date, months))
check out distribution of the months
#dhist.plot(data, months, "Month Released")
ggplot(data, aes(x=months)) +
geom_histogram(aes(y = stat(density)), bins = 30, fill = "white", col = "black") +
stat_function(
fun = dnorm,
args = list(mean = mean(data$months), sd = sd(data$months)),
lwd = 1,
col = 'red'
) +
xlab("Month Released") + ylab("Density") +
ggtitle(paste("Month Released", "Distribution")) +
theme_bw()
summary(data$months)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.0 241.0 285.0 266.3 317.0 349.0
try to find a transformation to normalize. As we can see the distribution of months is severely skewed to the left.
A common distribution towards normality for moderately skewed data is : \(log((max(y)+1) - y)\), since the max number of months is 349, the transformation would be : \(log(350 - y)\)
ggplot(data, aes(x=log(350-data$months))) +
geom_histogram(aes(y = stat(density)), bins = 30, fill = "white", col = "black") +
stat_function(
fun = dnorm,
args = list(mean = mean(log(350-data$months)), sd = sd(log(350-data$months))),
lwd = 1,
col = 'red'
) +
xlab("Month Released : log(350 - y))") + ylab("Density") +
ggtitle(paste("Month Released : log(350 - y))", "Distribution")) +
theme_bw()
summary(data$months)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.0 241.0 285.0 266.3 317.0 349.0
Another common transformation for moderate skewed data is a square root approach: \(\sqrt{((max(y)+1) - y)}\) = \(\sqrt{(350 - y)}\)
ggplot(data, aes(x=sqrt(350-data$months))) +
geom_histogram(aes(y = stat(density)), bins = 30, fill = "white", col = "black") +
stat_function(
fun = dnorm,
args = list(mean = mean(sqrt(350-data$months)), sd = sd(sqrt(350-data$months))),
lwd = 1,
col = 'red'
) +
xlab("Month Released : sqrt(350 - y))") + ylab("Density") +
ggtitle(paste("Month Released : sqrt(350 - y))", "Distribution")) +
theme_bw()
summary(data$months)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.0 241.0 285.0 266.3 317.0 349.0
Lastly, a common transformation for severely negatively skewed data takes an inverse approach: \(\frac{1}{(max(y)+1) - y)}\) = \(\frac{1}{(350 - y)}\)
ggplot(data, aes(x=1/(350-data$months))) +
geom_histogram(aes(y = stat(density)), bins = 30, fill = "white", col = "black") +
stat_function(
fun = dnorm,
args = list(mean = mean(1/(350-data$months)), sd = sd(1/(350-data$months))),
lwd = 1,
col = 'red'
) +
xlab("Month Released : 1/(350 - y))") + ylab("Density") +
ggtitle(paste("Month Released : 1/(350 - y))", "Distribution")) +
theme_bw()
summary(data$months)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.0 241.0 285.0 266.3 317.0 349.0
Create Training (75%) and Test data (25%) to train classification models on.
kpop <- dplyr::select(data, months, popularity, duration, acousticness, danceability, energy, instrumentalness, key, loudness, mode, speechiness, tempo, valence)
kpop0 <- kpop %>% dplyr::filter(mode == 0) %>% dplyr::select(-mode)
kpop1 <- kpop %>% dplyr::filter(mode == 1) %>% dplyr::select(-mode)
### Kpop mode 0 train and test
smpl.size0 <- floor(0.75*nrow(kpop0))
set.seed(123)
smpl0 <- sample(nrow(kpop0), smpl.size0, replace = FALSE)
train0 <- kpop0[smpl0,]
test0 <- kpop0[-smpl0,]
### Kpop mode 1 train and test
smpl.size1 <- floor(0.75*nrow(kpop1))
set.seed(123)
smpl1 <- sample(nrow(kpop1), smpl.size1, replace = FALSE)
train1 <- kpop1[smpl1,]
test1 <- kpop1[-smpl1,]
ml0 <- lm(months ~. , data = train0)
summary(ml0)
##
## Call:
## lm(formula = months ~ ., data = train0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -220.905 -29.308 6.643 37.204 199.560
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 506.91441 16.88344 30.024 < 2e-16 ***
## popularity 1.60481 0.05435 29.528 < 2e-16 ***
## duration -20.26179 1.81712 -11.151 < 2e-16 ***
## acousticness 31.41991 6.58842 4.769 1.93e-06 ***
## danceability -61.87134 10.53878 -5.871 4.74e-09 ***
## energy -98.67824 10.85541 -9.090 < 2e-16 ***
## instrumentalness 69.35253 14.06277 4.932 8.54e-07 ***
## key1 -3.38890 4.81002 -0.705 0.481137
## key2 -8.25574 5.95496 -1.386 0.165724
## key3 -4.68988 6.66026 -0.704 0.481381
## key4 -16.64198 4.72042 -3.526 0.000428 ***
## key5 -6.37892 4.57708 -1.394 0.163508
## key6 -7.69000 4.77927 -1.609 0.107700
## key7 -3.91007 5.23715 -0.747 0.455354
## key8 -1.77216 5.65747 -0.313 0.754115
## key9 -16.31280 4.85082 -3.363 0.000780 ***
## key10 -13.05634 4.99295 -2.615 0.008962 **
## key11 -13.43026 4.45880 -3.012 0.002613 **
## loudness 14.12750 0.67833 20.827 < 2e-16 ***
## speechiness 39.57966 15.28264 2.590 0.009642 **
## tempo -0.12300 0.04024 -3.057 0.002255 **
## valence -25.15503 5.77332 -4.357 1.36e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55.61 on 3482 degrees of freedom
## Multiple R-squared: 0.4143, Adjusted R-squared: 0.4108
## F-statistic: 117.3 on 21 and 3482 DF, p-value: < 2.2e-16
check assumptions with diagnostic plots
#par(mfrow = c(4,1))
plot(ml0)
Residuals vs. Fitted : Does not show a horizontal line. There is a noticeable pattern of a slight curve in the trend.
Normal QQ: Residual points roughly follow the normal QQ line. The left side falls below the normal line below theoretical quantile -2. Otherwise, the distribution of the residuals appears to roughly follow a normal distribution.
*Scale-Location: Due to the clear decreasing line rather thana flat horizontal line of equally spread points, there is clear evidence of violation for homogeneity of the variance of the residuals. May need to run the model on a transformation of the outcome variable which is the months of song release.
qqnorm(ml0$residuals)
qqline(ml0$residuals, col = "red")
checking for multicollinearity
car::vif(ml0)
## GVIF Df GVIF^(1/(2*Df))
## popularity 1.166795 1 1.080183
## duration 1.095607 1 1.046713
## acousticness 1.572984 1 1.254187
## danceability 1.463926 1 1.209928
## energy 2.625882 1 1.620457
## instrumentalness 1.062607 1 1.030828
## key 1.085836 11 1.003750
## loudness 1.974944 1 1.405327
## speechiness 1.078396 1 1.038458
## tempo 1.112227 1 1.054622
## valence 1.540583 1 1.241202
Overall, the current model does not meet the assumptions of the linear model. We need to make a transformation on the response variable to achieve normality of the residuals
yhat.ml0 = predict(ml0, newdata = test0)
data.frame(
RMSE = RMSE(yhat.ml0, test0$months),
R2 = R2(yhat.ml0, test0$months)
)
## RMSE R2
## 1 56.76951 0.3780801
boxcox(ml0,lambda = seq(1.7, 3, 0.1), plotit = TRUE)
Optimal transformation is for \(\lambda = 2.3\)
hist(bc.transform(data$months, 2.3))
ml0.bc <- lm(bc.transform(months, 2.3) ~. , data = train0)
summary(ml0.bc)
##
## Call:
## lm(formula = bc.transform(months, 2.3) ~ ., data = train0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -182515 -45410 3662 45295 209577
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 387391.90 19370.62 19.999 < 2e-16 ***
## popularity 2137.24 62.36 34.275 < 2e-16 ***
## duration -22523.94 2084.80 -10.804 < 2e-16 ***
## acousticness 33787.57 7558.99 4.470 8.08e-06 ***
## danceability -61484.25 12091.30 -5.085 3.87e-07 ***
## energy -72490.65 12454.57 -5.820 6.40e-09 ***
## instrumentalness 79343.44 16134.42 4.918 9.16e-07 ***
## key1 -1387.89 5518.61 -0.251 0.80145
## key2 -5870.47 6832.21 -0.859 0.39027
## key3 -6175.19 7641.41 -0.808 0.41908
## key4 -15890.01 5415.81 -2.934 0.00337 **
## key5 -4046.15 5251.36 -0.770 0.44106
## key6 -6788.43 5483.33 -1.238 0.21580
## key7 670.53 6008.66 0.112 0.91115
## key8 661.37 6490.90 0.102 0.91885
## key9 -15530.42 5565.41 -2.791 0.00529 **
## key10 -15579.34 5728.48 -2.720 0.00657 **
## key11 -12658.78 5115.65 -2.475 0.01339 *
## loudness 11349.58 778.26 14.583 < 2e-16 ***
## speechiness 37433.06 17534.00 2.135 0.03284 *
## tempo -128.82 46.17 -2.790 0.00530 **
## valence -33700.33 6623.82 -5.088 3.81e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 63800 on 3482 degrees of freedom
## Multiple R-squared: 0.4159, Adjusted R-squared: 0.4124
## F-statistic: 118.1 on 21 and 3482 DF, p-value: < 2.2e-16
check diagnostic plots
plot(ml0.bc)
normality has improved but nothing else ...
yhat.ml0.bc = predict(ml0.bc, newdata = test0 %>% mutate(months = bc.transform(test0$months, 2.3)))
data.frame(
RMSE = RMSE(yhat.ml0.bc, bc.transform(test0$months, 2.3)),
R2 = R2(yhat.ml0.bc, bc.transform(test0$months, 2.3))
)
## RMSE R2
## 1 64627.64 0.3899321
hist(log(360-train0$months))
summary(log(360-train0$months))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.398 3.732 4.304 4.261 4.796 5.878
ml0.log360 <- lm(log(360-train0$months) ~. , data = train0)
summary(ml0.log360)
##
## Call:
## lm(formula = log(360 - train0$months) ~ ., data = train0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.39175 -0.40146 0.04915 0.44693 1.61319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.7389301 0.1848139 14.820 < 2e-16 ***
## popularity -0.0215638 0.0005949 -36.246 < 2e-16 ***
## duration 0.1940970 0.0198910 9.758 < 2e-16 ***
## acousticness -0.2401500 0.0721199 -3.330 0.000878 ***
## danceability 0.4237910 0.1153624 3.674 0.000243 ***
## energy 0.5720095 0.1188284 4.814 1.54e-06 ***
## instrumentalness -0.8481059 0.1539376 -5.509 3.86e-08 ***
## key1 -0.0127793 0.0526528 -0.243 0.808245
## key2 0.0157264 0.0651857 0.241 0.809372
## key3 0.0354738 0.0729063 0.487 0.626596
## key4 0.1181835 0.0516720 2.287 0.022245 *
## key5 0.0170423 0.0501029 0.340 0.733768
## key6 0.0505054 0.0523161 0.965 0.334417
## key7 -0.0435365 0.0573283 -0.759 0.447650
## key8 -0.0555081 0.0619293 -0.896 0.370147
## key9 0.1087090 0.0530993 2.047 0.040706 *
## key10 0.1264746 0.0546551 2.314 0.020723 *
## key11 0.1012854 0.0488081 2.075 0.038044 *
## loudness -0.0871796 0.0074253 -11.741 < 2e-16 ***
## speechiness -0.3296733 0.1672909 -1.971 0.048842 *
## tempo 0.0009236 0.0004405 2.097 0.036072 *
## valence 0.3243115 0.0631975 5.132 3.03e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6087 on 3482 degrees of freedom
## Multiple R-squared: 0.4105, Adjusted R-squared: 0.4069
## F-statistic: 115.4 on 21 and 3482 DF, p-value: < 2.2e-16
check diagnostic plots
plot(ml0.log360)
Predictions and performance diagnostics
yhat.ml0.log360 = predict(ml0.log360, newdata = test0 %>% mutate(months = log(360 - test0$months)))
data.frame(
RMSE = RMSE(yhat.ml0.log360, log(360 - test0$months)),
R2 = R2(yhat.ml0.log360, log(360 - test0$months))
)
## RMSE R2
## 1 0.5976752 0.3932124
hist(sqrt(350-train0$months))
summary(sqrt(350-train0$months))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.635 8.000 8.377 10.536 18.628
ml0.sqrt350 <- lm(sqrt(350-train0$months) ~. , data = train0)
summary(ml0.sqrt350)
##
## Call:
## lm(formula = sqrt(350 - train0$months) ~ ., data = train0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4296 -2.0157 -0.0706 2.0165 9.4299
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.352232 0.877636 -1.541 0.12346
## popularity -0.097745 0.002825 -34.598 < 2e-16 ***
## duration 1.011761 0.094457 10.711 < 2e-16 ***
## acousticness -1.411834 0.342480 -4.122 3.84e-05 ***
## danceability 2.630235 0.547828 4.801 1.64e-06 ***
## energy 3.783006 0.564287 6.704 2.36e-11 ***
## instrumentalness -3.941710 0.731012 -5.392 7.43e-08 ***
## key1 0.040263 0.250035 0.161 0.87208
## key2 0.235396 0.309551 0.760 0.44704
## key3 0.212147 0.346214 0.613 0.54007
## key4 0.712502 0.245377 2.904 0.00371 **
## key5 0.187106 0.237926 0.786 0.43169
## key6 0.315464 0.248436 1.270 0.20424
## key7 -0.036510 0.272238 -0.134 0.89332
## key8 -0.112283 0.294087 -0.382 0.70263
## key9 0.678401 0.252156 2.690 0.00717 **
## key10 0.660914 0.259544 2.546 0.01093 *
## key11 0.589000 0.231778 2.541 0.01109 *
## loudness -0.560716 0.035261 -15.902 < 2e-16 ***
## speechiness -1.819528 0.794423 -2.290 0.02206 *
## tempo 0.005484 0.002092 2.622 0.00879 **
## valence 1.494620 0.300109 4.980 6.66e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.89 on 3482 degrees of freedom
## Multiple R-squared: 0.4249, Adjusted R-squared: 0.4214
## F-statistic: 122.5 on 21 and 3482 DF, p-value: < 2.2e-16
check diagnostic plots
plot(ml0.sqrt350)
Predictions and performance diagnostics
yhat.ml0.sqrt350 = predict(ml0.sqrt350, newdata = test0 %>% dplyr::mutate(months = sqrt(350-test0$months)))
data.frame(
RMSE = RMSE(yhat.ml0.sqrt350, sqrt(350-test0$months)),
R2 = R2(yhat.ml0.sqrt350, sqrt(350-test0$months))
)
## RMSE R2
## 1 2.894936 0.3988485